import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
import plotly.graph_objects as go
import plotly.express as px
# Load the weather observations from the CSV file into a DataFrame.
data = pd.read_csv('EnglandWeather.csv')
# Bare expression: in a notebook this renders the frame as the cell output.
data
| | Formatted Date | Summary | Precip Type | Temperature (C) | Wind Speed (km/h) | Pressure (millibars) | Humidity |
|---|---|---|---|---|---|---|---|
| 0 | 2006-04-01 00:00:00.000 +0200 | Partly Cloudy | rain | 9.472222 | 14.1197 | 1015.13 | 0.89 |
| 1 | 2006-04-01 01:00:00.000 +0200 | Partly Cloudy | rain | 9.355556 | 14.2646 | 1015.63 | 0.86 |
| 2 | 2006-04-01 02:00:00.000 +0200 | Mostly Cloudy | rain | 9.377778 | 3.9284 | 1015.94 | 0.89 |
| 3 | 2006-04-01 03:00:00.000 +0200 | Partly Cloudy | rain | 8.288889 | 14.1036 | 1016.41 | 0.83 |
| 4 | 2006-04-01 04:00:00.000 +0200 | Mostly Cloudy | rain | 8.755556 | 11.0446 | 1016.51 | 0.83 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 96448 | 2016-09-09 19:00:00.000 +0200 | Partly Cloudy | rain | 26.016667 | 10.9963 | 1014.36 | 0.43 |
| 96449 | 2016-09-09 20:00:00.000 +0200 | Partly Cloudy | rain | 24.583333 | 10.0947 | 1015.16 | 0.48 |
| 96450 | 2016-09-09 21:00:00.000 +0200 | Partly Cloudy | rain | 22.038889 | 8.9838 | 1015.66 | 0.56 |
| 96451 | 2016-09-09 22:00:00.000 +0200 | Partly Cloudy | rain | 21.522222 | 10.5294 | 1015.95 | 0.60 |
| 96452 | 2016-09-09 23:00:00.000 +0200 | Partly Cloudy | rain | 20.438889 | 5.8765 | 1016.16 | 0.61 |
96453 rows × 7 columns
# Bind the loaded table to `df`, the name used by every later step.
# NOTE(review): `data` is already a DataFrame (it comes from pd.read_csv), so
# this wrapper looks redundant — confirm whether an independent copy was intended.
df = pd.DataFrame(data)
# Display the frame as the cell output.
df
| | Formatted Date | Summary | Precip Type | Temperature (C) | Wind Speed (km/h) | Pressure (millibars) | Humidity |
|---|---|---|---|---|---|---|---|
| 0 | 2006-04-01 00:00:00.000 +0200 | Partly Cloudy | rain | 9.472222 | 14.1197 | 1015.13 | 0.89 |
| 1 | 2006-04-01 01:00:00.000 +0200 | Partly Cloudy | rain | 9.355556 | 14.2646 | 1015.63 | 0.86 |
| 2 | 2006-04-01 02:00:00.000 +0200 | Mostly Cloudy | rain | 9.377778 | 3.9284 | 1015.94 | 0.89 |
| 3 | 2006-04-01 03:00:00.000 +0200 | Partly Cloudy | rain | 8.288889 | 14.1036 | 1016.41 | 0.83 |
| 4 | 2006-04-01 04:00:00.000 +0200 | Mostly Cloudy | rain | 8.755556 | 11.0446 | 1016.51 | 0.83 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 96448 | 2016-09-09 19:00:00.000 +0200 | Partly Cloudy | rain | 26.016667 | 10.9963 | 1014.36 | 0.43 |
| 96449 | 2016-09-09 20:00:00.000 +0200 | Partly Cloudy | rain | 24.583333 | 10.0947 | 1015.16 | 0.48 |
| 96450 | 2016-09-09 21:00:00.000 +0200 | Partly Cloudy | rain | 22.038889 | 8.9838 | 1015.66 | 0.56 |
| 96451 | 2016-09-09 22:00:00.000 +0200 | Partly Cloudy | rain | 21.522222 | 10.5294 | 1015.95 | 0.60 |
| 96452 | 2016-09-09 23:00:00.000 +0200 | Partly Cloudy | rain | 20.438889 | 5.8765 | 1016.16 | 0.61 |
96453 rows × 7 columns
# (number of rows, number of columns) of the loaded frame.
df.shape
(96453, 7)
# Per-column dtype and non-null count; shows which columns have missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Formatted Date        96453 non-null  object
 1   Summary               96453 non-null  object
 2   Precip Type           95936 non-null  object
 3   Temperature (C)       96453 non-null  float64
 4   Wind Speed (km/h)     96453 non-null  float64
 5   Pressure (millibars)  96453 non-null  float64
 6   Humidity              96453 non-null  float64
dtypes: float64(4), object(3)
memory usage: 5.2+ MB
# Summary statistics for every column; include='all' adds the string columns
# (count/unique/top/freq) alongside the numeric ones (mean/std/quantiles).
df.describe(include='all')
| | Formatted Date | Summary | Precip Type | Temperature (C) | Wind Speed (km/h) | Pressure (millibars) | Humidity |
|---|---|---|---|---|---|---|---|
| count | 96453 | 96453 | 95936 | 96453.000000 | 96453.000000 | 96453.000000 | 96453.000000 |
| unique | 96429 | 27 | 2 | NaN | NaN | NaN | NaN |
| top | 2010-08-02 00:00:00.000 +0200 | Partly Cloudy | rain | NaN | NaN | NaN | NaN |
| freq | 2 | 31733 | 85224 | NaN | NaN | NaN | NaN |
| mean | NaN | NaN | NaN | 11.932678 | 10.810640 | 1003.235956 | 0.734899 |
| std | NaN | NaN | NaN | 9.551546 | 6.913571 | 116.969906 | 0.195473 |
| min | NaN | NaN | NaN | -21.822222 | 0.000000 | 0.000000 | 0.000000 |
| 25% | NaN | NaN | NaN | 4.688889 | 5.828200 | 1011.900000 | 0.600000 |
| 50% | NaN | NaN | NaN | 12.000000 | 9.965900 | 1016.450000 | 0.780000 |
| 75% | NaN | NaN | NaN | 18.838889 | 14.135800 | 1021.090000 | 0.890000 |
| max | NaN | NaN | NaN | 39.905556 | 63.852600 | 1046.380000 | 1.000000 |
# Count the missing values in each column.
df.isnull().sum()
Formatted Date            0
Summary                   0
Precip Type             517
Temperature (C)           0
Wind Speed (km/h)         0
Pressure (millibars)      0
Humidity                  0
dtype: int64
# Drop every row that contains a missing value; per the null counts only
# 'Precip Type' has any (517 rows). The original row index is kept as-is.
df = df.dropna()
# Confirm the row count after dropping.
df.shape
(95936, 7)
# Extract the calendar year of each observation as an integer Series.
# utc=True makes pandas convert every timestamp to UTC while parsing; the
# 'Formatted Date' strings carry their own offset (e.g. +0200).
# NOTE(review): because of that conversion this is the UTC year, so an
# observation in the first local hours of 1 January is attributed to the
# previous year — confirm this is acceptable for the per-year counts.
year = pd.to_datetime(df['Formatted Date'], utc = True).dt.year
# Display the Series as the cell output.
year
0 2006
1 2006
2 2006
3 2006
4 2006
...
96448 2016
96449 2016
96450 2016
96451 2016
96452 2016
Name: Formatted Date, Length: 95936, dtype: int32
# Bar chart of how many observations fall in each year.
plt.title("Num of Rows per Year")
sns.countplot(x=year)
# Request the grid explicitly: a bare plt.grid() *toggles* visibility, and the
# seaborn "darkgrid" style set at the top of the file already enables the
# grid, so the argument-less call does not reliably mean "grid on".
plt.grid(True)
#plt.savefig('Num of Rows per Year.jpeg')
plt.show()
# Append the extracted year as the last column of df.
# The position is derived from the current column count instead of a
# hard-coded 7, and the guard makes the cell safe to re-run: DataFrame.insert
# raises ValueError when the column already exists.
if "Year" not in df.columns:
    df.insert(len(df.columns), "Year", year)
# Display the frame with the new column.
df
| | Formatted Date | Summary | Precip Type | Temperature (C) | Wind Speed (km/h) | Pressure (millibars) | Humidity | Year |
|---|---|---|---|---|---|---|---|---|
| 0 | 2006-04-01 00:00:00.000 +0200 | Partly Cloudy | rain | 9.472222 | 14.1197 | 1015.13 | 0.89 | 2006 |
| 1 | 2006-04-01 01:00:00.000 +0200 | Partly Cloudy | rain | 9.355556 | 14.2646 | 1015.63 | 0.86 | 2006 |
| 2 | 2006-04-01 02:00:00.000 +0200 | Mostly Cloudy | rain | 9.377778 | 3.9284 | 1015.94 | 0.89 | 2006 |
| 3 | 2006-04-01 03:00:00.000 +0200 | Partly Cloudy | rain | 8.288889 | 14.1036 | 1016.41 | 0.83 | 2006 |
| 4 | 2006-04-01 04:00:00.000 +0200 | Mostly Cloudy | rain | 8.755556 | 11.0446 | 1016.51 | 0.83 | 2006 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 96448 | 2016-09-09 19:00:00.000 +0200 | Partly Cloudy | rain | 26.016667 | 10.9963 | 1014.36 | 0.43 | 2016 |
| 96449 | 2016-09-09 20:00:00.000 +0200 | Partly Cloudy | rain | 24.583333 | 10.0947 | 1015.16 | 0.48 | 2016 |
| 96450 | 2016-09-09 21:00:00.000 +0200 | Partly Cloudy | rain | 22.038889 | 8.9838 | 1015.66 | 0.56 | 2016 |
| 96451 | 2016-09-09 22:00:00.000 +0200 | Partly Cloudy | rain | 21.522222 | 10.5294 | 1015.95 | 0.60 | 2016 |
| 96452 | 2016-09-09 23:00:00.000 +0200 | Partly Cloudy | rain | 20.438889 | 5.8765 | 1016.16 | 0.61 | 2016 |
95936 rows × 8 columns
# Tally the observations per year into a two-column table (Year, Count).
# Note that this rebinds `data`, which until here held the raw CSV frame.
data = (
    df['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Count')
)

# Donut-style pie chart of each year's share of the rows, title centred.
fig = px.pie(data, names='Year', values='Count', hole=0.2)
fig.update_layout(title={'text': "Distribution of Years", 'x': 0.5})
fig.show()
# One histogram per numeric column (100 bins each), drawn on a single large figure.
df.hist(bins=100,figsize=(24,20))
plt.show()
# Columns to plot in the per-feature charts below: every numeric column.
# Selecting by dtype instead of the positional slice df.columns[3:] keeps this
# correct if the column order changes; for the dtypes reported by df.info()
# (three leading object columns, then numeric ones) the result is the same.
features = df.select_dtypes(include='number').columns
# Display the selected column names.
features
Index(['Temperature (C)', 'Wind Speed (km/h)', 'Pressure (millibars)',
'Humidity', 'Year'],
dtype='object')
# Histogram with a density-estimate (KDE) line for each feature, laid out on a
# two-column grid of subplots.
n_cols = 2
# Ceiling division: enough rows to hold every feature (3 rows for 5 features).
n_rows = (len(features) + n_cols - 1) // n_cols
plt.figure(figsize=(22, 20))
for i, feature in enumerate(features, start=1):
    # Subplot positions are 1-based.
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df[feature], stat="density", kde=True)
    plt.title(f"Distribution of {feature}")
# Adjust spacing once, after all subplots exist, so titles do not overlap.
plt.tight_layout()
plt.show()
# Horizontal box plot for each feature on a two-column grid of subplots, to
# show spread and outliers.
n_cols = 2
# Ceiling division: enough rows to hold every feature (3 rows for 5 features).
n_rows = (len(features) + n_cols - 1) // n_cols
plt.figure(figsize=(24, 20))
for i, feature in enumerate(features, start=1):
    # Subplot positions are 1-based.
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(data=df, x=feature)
    plt.title(f"{feature}")
# Adjust spacing once, after all subplots exist, so titles do not overlap.
plt.tight_layout()
plt.show()
# Scatter plot of humidity against temperature.
plt.figure(figsize=(10, 6), dpi=80)
plt.scatter(df['Temperature (C)'], df['Humidity'], color='#17becf', s=10)
plt.title("Humidity & Temperature (C)", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Temperature (C)', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() *toggles* visibility, and the
# seaborn "darkgrid" style set at the top of the file already enables the
# grid, so the argument-less call would switch it off.
plt.grid(True)
#plt.savefig('Humidity & Temperature (C).jpeg')
plt.show()
# Scatter plot of humidity against wind speed.
plt.figure(figsize=(10, 6), dpi=80)
plt.scatter(df['Wind Speed (km/h)'], df['Humidity'], color='#17becf', s=10)
plt.title("Humidity & Wind Speed (km/h)", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Wind Speed (km/h)', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() *toggles* visibility, and the
# seaborn "darkgrid" style set at the top of the file already enables the
# grid, so the argument-less call would switch it off.
plt.grid(True)
#plt.savefig('Humidity & Wind Speed.jpeg')
plt.show()
# Scatter plot of humidity against atmospheric pressure.
plt.figure(figsize=(10, 6), dpi=80)
# s=10 matches the marker size used by the temperature and wind-speed scatter
# plots so the three numeric scatters are visually comparable.
plt.scatter(df['Pressure (millibars)'], df['Humidity'], color='#17becf', s=10)
plt.title("Humidity & Pressure (millibars)", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Pressure (millibars)', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() *toggles* visibility, and the
# seaborn "darkgrid" style set at the top of the file already enables the
# grid, so the argument-less call would switch it off.
plt.grid(True)
#plt.savefig('Humidity & Pressure (millibars).jpeg')
plt.show()
# Humidity values for each weather-summary category (categorical x axis).
plt.figure(figsize=(10, 6), dpi=80)
plt.scatter(df['Summary'], df['Humidity'], color='#17becf')
plt.title("Humidity & Summary of weather", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Summary', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() *toggles* visibility, and the
# seaborn "darkgrid" style set at the top of the file already enables the
# grid, so the argument-less call would switch it off.
plt.grid(True)
# Category names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
# Save before plt.show(), while the figure is still the current one.
plt.savefig('Humidity & Summary of weather.png')
plt.show()
# Humidity values for each precipitation type (categorical x axis).
plt.figure(figsize=(10, 6), dpi=80)
plt.scatter(df['Precip Type'], df['Humidity'], color='#17becf')
plt.title("Humidity & Precip Type", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Precip Type', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() *toggles* visibility, and the
# seaborn "darkgrid" style set at the top of the file already enables the
# grid, so the argument-less call would switch it off.
plt.grid(True)
plt.xticks(rotation=90)
#plt.savefig('Humidity & Precip Type.jpeg')
plt.show()
# Overlay of humidity against temperature (cyan) and against wind speed (red)
# on one shared pair of axes.
plt.figure(figsize=(10, 6), dpi=80)
# Each series carries its own label so the legend cannot drift out of sync
# with the plotting order.
plt.scatter(df['Temperature (C)'], df['Humidity'], color='#17becf', label='Temperature (C)')
plt.scatter(df['Wind Speed (km/h)'], df['Humidity'], color='#DE2222', label='Wind Speed (km/h)')
plt.title('Temperature (C) & Wind Speed (km/h)', backgroundcolor='#D4D587', color='white', fontsize=23)
# The x axis carries both quantities, so the label names both of them.
plt.xlabel('Temperature (C) / Wind Speed (km/h)', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
plt.legend()
# Request the grid explicitly: a bare plt.grid() *toggles* visibility, and the
# seaborn "darkgrid" style set at the top of the file already enables the
# grid, so the argument-less call would switch it off.
plt.grid(True)
#plt.savefig('Temperature & Wind Speed.jpeg')
plt.show()